Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients. These ingredients include cement, blast furnace slag, fly ash, water, superplasticizer, coarse aggregate, and fine aggregate.
The purpose of this case study is to model the strength of high-performance concrete using machine learning.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import itertools
%matplotlib inline
sns.set(style="ticks", color_codes=True)
from scipy.stats import zscore
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# --- Load the dataset and run basic sanity checks ---
concrete = pd.read_csv('concrete.csv')
# Shape of Data
concrete.shape
# Info
concrete.info()
concrete.head(15)
# Summary of Numeric Attributes
concrete.describe().transpose()
# Check missing values
concrete.isnull().sum()
# Check for data pollution - ensure all values are numeric
#concrete.applymap(lambda x: isinstance(x, (int, float))).all(1)
# Select rows containing any non-numeric cell (an empty result means clean data)
concrete[~concrete.applymap(lambda x: isinstance(x, (int, float))).all(1)]
# Data is all OK - All are numeric
from scipy.stats import pearsonr
# def corrfunc(x,y, ax=None, **kws):
# """Plot the correlation coefficient in the top left hand corner of a plot."""
# r, _ = pearsonr(x, y)
# ax = ax or plt.gca()
# # Unicode for lowercase rho (ρ)
# rho = '\u03C1'
# ax.annotate(f'{rho} = {r:.2f}', xy=(.1, .9), xycoords=ax.transAxes)
# Pairwise scatter plots with KDE diagonals: first look at feature
# relationships and marginal distributions.
g = sns.pairplot(concrete, diag_kind='kde')
# g.map_lower(corrfunc)
# plt.show()
# Visualize individual variables: one row per feature with a boxplot
# (outlier view, left) and a histogram/KDE with skewness (right).
fig, ax_arr = plt.subplots(nrows = 9, ncols = 2, figsize = (30,60))
ind_vars = concrete.columns
plt_row=0
for col in ind_vars:
#print(col)
# plot boxplot with class as hue
sns.boxplot(y = concrete[col], data = concrete, orient = 'v', ax = ax_arr[plt_row, 0])
ax_arr[plt_row,0].set_ylabel(col, fontsize=20)
ax_arr[plt_row,0].set_title(col + ' Distribution', fontsize=20)
ax_arr[plt_row,0].tick_params(labelsize=20)
# Histogram + rug plot; sample skewness is surfaced in the legend label
sns.distplot( concrete[col], ax = ax_arr[plt_row,1], rug=True, label= 'skew:' + str(concrete[col].skew()))
# Dashed vertical line marks the column mean
ax_arr[plt_row,1].axvline(concrete[col].mean(),linestyle="dashed",label="mean",color="k")
ax_arr[plt_row,1].set_title(col + ' Distribution', fontsize=20)
ax_arr[plt_row,1].legend(loc="best")
# sns.regplot(data=concrete, x=col, y='strength', ax = ax_arr[plt_row,1])
# ax_arr[plt_row,1].annotate(stats.pearsonr,
# xy=(.65, .03), xycoords=ax_arr[plt_row,1].transAxes, fontsize='xx-small')
#ax_arr[plt_row,1].annotate(stats.pearsonr, xy=(.1, .9))
# sns.relplot(x="strength", y=col, kind="line", ci="sd", data=concrete, ax = ax_arr[plt_row,1]);
#ax_arr[plt_row,2].set_title('Distribution of ' + col + ' for Strength with SD', fontsize=10)
plt_row+=1
plt.subplots_adjust(wspace=0.5)
plt.tight_layout()
plt.show()
# Interquartile Range Method
# calculate interquartile range
def get_outliers(col, data=None):
    """Report and return IQR-based (Tukey-fence) outliers for one column.

    Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are flagged as outliers.

    Parameters
    ----------
    col : str
        Column name to analyse.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the module-level ``concrete``
        DataFrame loaded above, preserving the original call style.

    Returns
    -------
    list
        The outlying values. (The original returned None and callers
        ignore the return value, so this is backward compatible.)
    """
    att = (concrete if data is None else data)[col]
    q25, q75 = np.percentile(att, 25), np.percentile(att, 75)
    iqr = q75 - q25
    # calculate the outlier cutoff: Tukey fence at 1.5 * IQR beyond the quartiles
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    # identify values falling outside the fences
    outliers = [x for x in att if x < lower or x > upper]
    print(col, " IQR: ", lower," - ", upper)
    print(col, " outliers:", outliers)
    print("---------------------------------------------------------")
    return outliers
# Report IQR outliers for the features whose boxplots showed outliers.
cols = ["age", "fineagg", "superplastic", "water", "slag"]
for col in cols:
get_outliers(col)
#sns.jointplot(col, "strength", data=concrete, kind="reg").annotate(stats.pearsonr)
#### Explore correlation with strength
# Regression joint plots of each candidate feature against the target.
# NOTE(review): JointGrid.annotate was removed in later seaborn releases;
# this code requires an old seaborn version -- confirm the pinned version.
cols = ["age", "fineagg", "superplastic", "water", "slag"]
for col in cols:
sns.jointplot(col, "strength", data=concrete, kind="reg").annotate(stats.pearsonr)
##### We will try log transformation of age feature to reduce the effect of outliers. Other techniques like binning or removing outliers lead to loss of information so we will not do it.
##### In addition, we will use feature regularization and other such models that are robust to outliers.
##### Explore Log Transformation of age
# Log-transform age to damp the influence of the extreme-age outliers;
# non-positive values (if any) map to 0 instead of raising on log.
def _safe_log(value):
    return np.log(value) if value > 0 else 0

concrete["Log_age"] = concrete["age"].apply(_safe_log)
# Compare skewness before and after the transform.
for feature in ('age', 'Log_age'):
    print(concrete[feature].skew())
sns.jointplot("Log_age", "strength", data=concrete, kind="reg").annotate(stats.pearsonr)
# Correlation with heat map
corr = concrete.corr()
print("Correlation with Target (Strength): \n", corr['strength'].sort_values(ascending=False))
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
# create a mask so we only see the correlation values once
# (hide the upper triangle above the diagonal)
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
a = sns.heatmap(corr,mask=mask, annot=True, fmt='.2f')
# Rotate tick labels so the feature names remain readable
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)
# Accumulates one summary row per named model run, keyed by m_name.
modelResult = {}
def model(algorithm,dtrainx,dtrainy,dtestx,dtesty,of_type, m_name=""):
"""Fit one regressor/pipeline, print its metrics and draw diagnostics.

Parameters:
algorithm -- estimator or sklearn Pipeline; the coefficient/importance
    plots index it as algorithm['rm'], so pipelines must name their
    final step 'rm'.
dtrainx, dtrainy -- training features / target.
dtestx, dtesty -- test features / target. The originals-vs-predictions
    plot hardcodes the target column name 'strength'.
of_type -- 'coef' (linear coefficients), 'feat' (tree feature
    importances), or 'polycoef' (print polynomial coefficient summary);
    any other value skips the second subplot.
m_name -- optional run label; when non-empty the metrics are stored in
    the global modelResult.

Side effects: prints metrics, draws plots, mutates modelResult.
"""
print (algorithm)
print ("***************************************************************************")
algorithm.fit(dtrainx,dtrainy)
pred = algorithm.predict(dtestx)
rmse = np.sqrt(mean_squared_error(dtesty,pred))
print ("ROOT MEAN SQUARED ERROR :", rmse)
print ("MEAN ABSOLUTE ERROR :", mean_absolute_error(dtesty,pred))
print ("***************************************************************************")
# score() is R^2 for regressors; train-vs-test gap exposes overfitting
train_score = algorithm.score(dtrainx,dtrainy)
test_score = algorithm.score(dtestx,dtesty)
print ("TRAIN SCORE :", train_score)
print ("TEST SCORE :", test_score)
# print(dtesty[:].values.ravel())
# print(pred.ravel())
# Pearson correlation (r, p-value) between actuals and predictions
pr,p= stats.pearsonr(x=dtesty[:].values.ravel(), y=pred.ravel())
print ("PEARSONS COEFF for TEST :", pr,p)
print ("***************************************************************************")
prediction = pd.DataFrame(pred)
# neg_mean_absolute_error
# neg_mean_squared_error
# 20-fold CV on the training split; scores are *negated* MSE, so values
# are negative and closer to 0 is better.
cross_val = cross_val_score(algorithm,dtrainx,dtrainy,cv=20,scoring="neg_mean_squared_error")
cross_val = cross_val.ravel()
cv_mean = cross_val.mean()
print ("CROSS VALIDATION SCORE")
print ("************************")
print ("cv-mean :",cv_mean)
print ("cv-std :",cross_val.std())
print ("cv-max :",cross_val.max())
print ("cv-min :",cross_val.min())
# Actual-vs-predicted joint plot with a regression fit.
# NOTE(review): JointGrid.annotate requires an old seaborn version.
with sns.axes_style("white"):
sns.jointplot(x=dtesty[:].values.ravel(), y=pred.ravel(), kind="reg", color="k").annotate(stats.pearsonr);
if (m_name):
modelResult[m_name]= {"Model":m_name, "RMSE":rmse, "Train Score":train_score, "Test Score":test_score, "Pearsonsr":pr, 'CV Mean':cv_mean}
plt.figure(figsize=(13,28))
plt.subplot(211)
# NOTE(review): hardcodes the target column name 'strength'
testy = dtesty.reset_index()["strength"]
ax = testy.plot(label="originals",figsize=(12,13),linewidth=2)
ax = prediction[0].plot(label = "predictions",figsize=(12,13),linewidth=2)
# plt.axhline(testy.mean(),color = "r",linestyle="dashed",label=("original_mean:",testy.mean()))
# plt.axhline(prediction[0].mean(),color="b",linestyle = "dashed",label=("prediction_mean:",prediction[0].mean()))
plt.legend(loc="best")
plt.title("ORIGINALS VS PREDICTIONS")
plt.xlabel("index")
plt.ylabel("values")
ax.set_facecolor("k")
plt.subplot(212)
# Second subplot: coefficients / feature importances, depending on of_type.
if of_type == "coef":
# Linear models: bar plot of per-feature coefficients
coef = pd.DataFrame(algorithm['rm'].coef_.ravel())
coef["feat"] = dtrainx.columns
ax1 = sns.barplot(coef["feat"],coef[0],palette="jet_r",
linewidth=2,edgecolor="k"*coef["feat"].nunique())
ax1.set_facecolor("lightgrey")
ax1.axhline(0,color="k",linewidth=2)
plt.ylabel("coefficients")
plt.xlabel("features")
plt.title('FEATURE IMPORTANCES')
elif of_type == "feat":
# Tree ensembles: bar plot of feature_importances_
coef = pd.DataFrame(algorithm['rm'].feature_importances_)
coef["feat"] = dtrainx.columns
ax2 = sns.barplot(coef["feat"],coef[0],palette="jet_r",
linewidth=2,edgecolor="k"*coef["feat"].nunique())
ax2.set_facecolor("lightgrey")
ax2.axhline(0,color="k",linewidth=2)
plt.ylabel("coefficients")
plt.xlabel("features")
plt.title('FEATURE IMPORTANCES')
elif of_type == "polycoef":
# Polynomial expansion: too many terms to plot, print a summary instead
coef = pd.DataFrame(algorithm['rm'].coef_.ravel())
print ("***************************************************************************")
print("Coefficents shape:", coef.shape)
print("Non-zero:", (coef!=0).sum())
print("Coefficents:", coef)
import warnings
# Blanket filter: suppresses *all* warnings for the rest of the run
warnings.filterwarnings("ignore")
# First with Original Age
X = concrete.drop(['strength', 'Log_age'], axis=1)
Y = concrete[['strength']]
# 30% of the data will be used for testing
test_size= 0.30
seed = 100
k_fold = KFold(n_splits=10, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
from sklearn.linear_model import LinearRegression
# lr = LinearRegression()
# Baseline: plain linear regression on standardised features
pipe_lr = Pipeline([('scl', StandardScaler()), ('rm', LinearRegression())])
model(pipe_lr,x_train,y_train,x_test,y_test,"coef", "LR - Standard Scaler - Original Data")
from sklearn.ensemble import RandomForestRegressor
pipe_lr = Pipeline([('scl', StandardScaler()), ('rm', RandomForestRegressor())])
model(pipe_lr,x_train,y_train,x_test,y_test,"feat", "RandomForest - StandardScaler - Original Data")
modelResult
pipe_lr = Pipeline([('scl', StandardScaler()), ('rm', Lasso())])
model(pipe_lr,x_train,y_train,x_test,y_test,"coef", "Lasso - StandardScaler - Original Data")
# PowerTransformer (Yeo-Johnson by default) to reduce skew before fitting
pipe_lr = Pipeline([('scl', PowerTransformer()), ('rm', LinearRegression())])
model(pipe_lr,x_train,y_train,x_test,y_test,"coef", "Linear Regression - PowerTransformer - Original Data")
# ridge = Ridge(alpha=.3)
# ridge.fit(x_train,y_train)
# print ("Ridge model:", (ridge.coef_))
pipe_lr = Pipeline([('scl', PowerTransformer()), ('rm', Ridge(alpha=0.5))])
model(pipe_lr,x_train,y_train,x_test,y_test,"coef", "Ridge - PowerTransformer - Original Data")
pipe_lr = Pipeline([('scl', PowerTransformer()), ('rm', Lasso(alpha=0.1))])
# pipe_lr.fit(x_train,y_train)
# print ("Lasso model:", (lasso.coef_))
model(pipe_lr,x_train,y_train,x_test,y_test,"coef", "Lasso - PowerTransformer - Original Data")
# Degree-2 interaction-only terms to capture pairwise ingredient effects
pipe_lr = Pipeline([('scl', PowerTransformer()),('pf',PolynomialFeatures(degree = 2, interaction_only=True)), ('rm', LinearRegression())])
model(pipe_lr,x_train,y_train,x_test,y_test,"polycoef", "Linear Regression - PowerTransformer - Polynomial - Original Data")
pipe_lr = Pipeline([('scl', PowerTransformer()),('pf',PolynomialFeatures(degree = 2, interaction_only=True)), ('rm', Ridge(alpha=0.5))])
model(pipe_lr,x_train,y_train,x_test,y_test,"polycoef", "Ridge - PowerTransformer - Polynomial - Original Data")
pipe_lr = Pipeline([('scl', PowerTransformer()),('pf',PolynomialFeatures(degree = 2, interaction_only=True)), ('rm', Lasso(alpha=0.1))])
model(pipe_lr,x_train,y_train,x_test,y_test,"polycoef", "Lasso - PowerTransformer - Polynomial - Original Data")
# --- Feature engineering: convert raw ingredient weights into mix ratios ---
# 1. Add ash, slag and superplastic to cement and then calculate ratio with fineagg(sand) and coarseagg
concrete['sum_cement'] = concrete['cement'] + concrete['ash'] + concrete['slag'] + concrete['superplastic']
concrete['solid_volume'] = concrete['sum_cement'] + concrete['fineagg'] + concrete['coarseagg']
# con_r holds only the derived ratio features (plus strength and age below)
con_r = pd.DataFrame()
con_r['cement_solid_ratio'] = concrete['sum_cement']/concrete['solid_volume']
con_r['fineagg_ratio'] = concrete['fineagg']/concrete['solid_volume']
con_r['coarseagg_ratio'] = concrete['coarseagg']/concrete['solid_volume']
# 2. Add ash, slag and superplastic to cement and then calculate ratio with water
concrete['cw_wt'] = concrete['sum_cement'] + concrete['water']
con_r['cement_cw_ratio'] = concrete['cement']/concrete['cw_wt']
con_r['water_cw_ratio'] = concrete['water']/concrete['cw_wt']
# 3. Add strength and age original columns
con_r['strength'] = concrete['strength']
con_r['age'] = concrete['age']
con_r.describe().transpose()
# Prepare a pair plot only for ratios
g = sns.pairplot(con_r, diag_kind='kde')
# Visualize individual engineered variables: boxplot, distribution (with
# skewness), and residuals of strength vs the feature.
fig, ax_arr = plt.subplots(nrows = 6, ncols = 3, figsize = (30,60))
ind_vars = con_r.columns
plt_row=0
for col in ind_vars:
# 'age' is skipped here (already analysed on the raw data above)
if (col!='age'):
sns.boxplot(y = con_r[col], data = con_r, orient = 'v', ax = ax_arr[plt_row, 0])
ax_arr[plt_row,0].set_ylabel(col, fontsize=20)
ax_arr[plt_row,0].set_title(col + ' Distribution', fontsize=20)
ax_arr[plt_row,0].tick_params(labelsize=20)
sns.distplot( con_r[col], ax = ax_arr[plt_row,1], rug=True, label= 'skew:' + "{:.3f}".format(con_r[col].skew()))
ax_arr[plt_row,1].axvline(con_r[col].mean(),linestyle="dashed",label="mean",color="k")
ax_arr[plt_row,1].set_title(col + ' Probability Distribution', fontsize=20)
ax_arr[plt_row,1].legend(loc="best",fontsize='30')
# Residual plot of a simple linear fit of strength on this feature
sns.residplot(col, 'strength', data=con_r, ax= ax_arr[plt_row,2])
ax_arr[plt_row,2].set_title(col + ' Residual', fontsize=20)
plt_row+=1
plt.subplots_adjust(wspace=0.5)
plt.tight_layout()
plt.show()
# Correlation with heat map (engineered ratio features)
corr = con_r.corr()
print("Correlation with Target (Strength): \n", corr['strength'].sort_values(ascending=False))
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
# create a mask so we only see the correlation values once
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
a = sns.heatmap(corr,mask=mask, annot=True, fmt='.2f')
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)
# con_r['sum_cement'] = concrete['sum_cement'].values
# con_r['solid_volume'] = concrete['solid_volume'].values
# con_r['cw_wt'] = concrete['cw_wt'].values
# Train/test split on the engineered ratio features
X = con_r.drop(['strength'], axis=1)
Y = con_r[['strength']]
print(con_r.head())
# 30% of the data will be used for testing
test_size= 0.30
seed = 100
k_fold = KFold(n_splits=10, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# lr = LinearRegression()
pipe_lr = Pipeline([('scl', StandardScaler()), ('rm', LinearRegression())])
model(pipe_lr,x_train,y_train,x_test,y_test,"coef", "Linear Regression - Standard Scaler - Modified Data")
# Using Box-Cox Transform
# NOTE: box-cox requires strictly positive inputs -- assumes all ratio
# features and age are positive; verify before reuse on other data
pipe_lr = Pipeline([('pt', PowerTransformer(method='box-cox')),('rm', LinearRegression())])
model(pipe_lr,x_train,y_train,x_test,y_test,"coef", "Linear Regression - PowerTransformer - Modified Data")
# The low score is due to the apparent mix of gaussians
# Let us explore the data for hidden clusters
# Elbow method: fit KMeans for k = 2..9 and record inertia (within-cluster SSE)
cluster_range = range( 2, 10 ) # expect 3 to 4 clusters from the pair panel visual inspection hence restricting from 2 to 10
cluster_errors = []
for num_clusters in cluster_range:
clusters = KMeans( num_clusters, n_init = 5)
clusters.fit(con_r)
labels = clusters.labels_
centroids = clusters.cluster_centers_
cluster_errors.append( clusters.inertia_ )
clusters_df = pd.DataFrame( { "num_clusters":cluster_range, "cluster_errors": cluster_errors } )
clusters_df[0:15]
# ridge = Ridge(alpha=.3)
# model(ridge,x_train,y_train,x_test,y_test,"coef")
# Elbow plot
plt.figure(figsize=(12,6))
plt.plot( clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" )
# Lets use 5 clusters
cc_df_attr = con_r.loc[:,:]
# z-score every column so KMeans distances are scale-free
cc_df_attr_z = cc_df_attr.apply(zscore)
cluster = KMeans( n_clusters = 5, random_state = 2354 )
cluster.fit(cc_df_attr_z)
prediction=cluster.predict(cc_df_attr_z)
cc_df_attr_z["GROUP"] = prediction # Creating a new column "GROUP" which will hold the cluster id of each record
cc_df_attr_z_copy = cc_df_attr_z.copy(deep = True) # Creating a mirror copy for later re-use instead of building repeatedly
# Per-cluster regression fits of strength against selected features
with sns.axes_style("white"):
plot = sns.lmplot("age",'strength',data=cc_df_attr_z,hue='GROUP')
plot.set(ylim = (-3,3))
with sns.axes_style("white"):
plot = sns.lmplot("cement_solid_ratio",'strength',data=cc_df_attr_z,hue='GROUP')
plot.set(ylim = (-3,3))
with sns.axes_style("white"):
plot = sns.lmplot("cement_cw_ratio",'strength',data=cc_df_attr_z,hue='GROUP')
plot.set(ylim = (-3,3))
# get unique age values of each cluster
print(con_r['age'].value_counts(normalize=True).sort_index())
# Let's try see the % of data we'd have to work with for different cutoffs
cutoffs = [28, 150]
for cutoff in cutoffs:
print('{0:.1f}% with age > {1}'.format(100.0 * con_r[ con_r.age > cutoff ].shape[0] / con_r.shape[0], cutoff))
# Model each age cluster separately:
#   young:  age <= 28 days
#   middle: 28 < age <= 150 days
#   old:    age > 150 days
# FIX: the original first condition was a strict `< 28`, which silently
# dropped every row with age exactly 28 (a standard curing age) from all
# three clusters; `<= 28` makes the conditions partition the data.
# (The old comment also advertised 29-200 / >200 boundaries that never
# matched the code's 150-day cutoff.)
cond = [con_r['age'] <= 28, (con_r['age'] > 28) & (con_r['age'] <= 150), con_r['age'] > 150]
for c in cond:
    conr_temp = con_r[c]
    X = conr_temp.drop(['strength'], axis=1)
    Y = conr_temp[['strength']]
    # 30% of the data will be used for testing
    test_size = 0.30
    seed = 100
    k_fold = KFold(n_splits=10, shuffle=True)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
    # Box-Cox linear fit per age band (all features assumed positive)
    pipe_lr = Pipeline([('pt', PowerTransformer(method='box-cox')), ('rm', LinearRegression())])
    model(pipe_lr, x_train, y_train, x_test, y_test, "coef")
# Back to the full (un-clustered) ratio data
X = con_r.drop(['strength'], axis=1)
Y = con_r[['strength']]
# 30% of the data will be used for testing
test_size= 0.30
seed = 100
k_fold = KFold(n_splits=10, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
pipeline = Pipeline(steps=[('pt', PowerTransformer()),('pf',PolynomialFeatures(degree = 2, interaction_only=True)), ('rm', Lasso(alpha=0.1))])
model(pipeline,x_train,y_train,x_test,y_test,"polycoef", "Lasso - PowerTransformer - Polynomial - Modified Data")
# Same pipeline but with a z-scored target; metrics are then on the
# standardised scale, so they are not directly comparable to other runs
X = con_r.drop(['strength'], axis=1)
Y = con_r[['strength']].apply(zscore)
# 30% of the data will be used for testing
test_size= 0.30
seed = 100
k_fold = KFold(n_splits=10, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
pipeline = Pipeline(steps=[('pt', PowerTransformer()),('pf',PolynomialFeatures(degree = 2, interaction_only=True)), ('rm', Lasso(alpha=0.1))])
model(pipeline,x_train,y_train,x_test,y_test,"polycoef")
## There are several variables that are highly correlated with each other
# Run PCA and plot to visualise the ideal number of components.
# z-score only the age column; the ratio features already share a
# comparable 0-1 scale.
sc_x_train = x_train.loc[:, 'cement_solid_ratio':'water_cw_ratio']
sc_x_train["sc_age"] = x_train[["age"]].apply(zscore)
# Scree curve: cumulative explained variance by number of components.
# NOTE(review): this exploratory PCA is fitted on the *unscaled* x_train
# (raw 'age' included), while the 2-component PCA below is fitted on
# sc_x_train -- the raw age column dominating the variance is why a single
# component appears to explain almost everything. Consider fitting both
# on sc_x_train for a consistent picture.
pca = PCA().fit(x_train)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
# FIX: the label previously used ':/n' (a literal slash-n) instead of the
# intended newline escape '\n'.
print("Explained Variance Ratio :\n", np.cumsum(pca.explained_variance_ratio_))
# Based on the plot, we will select 2 components that explain more than 99% of the variance
pca = PCA(n_components=2)
pca.fit(sc_x_train)
# Apply the same column selection and age scaling to the test split.
# NOTE(review): zscore here uses the *test* split's own mean/std rather
# than the training statistics -- a train/test inconsistency; a fitted
# StandardScaler on the age column would be the proper approach.
sc_x_test = x_test.loc[:, 'cement_solid_ratio':'water_cw_ratio']
sc_x_test["sc_age"] = x_test[["age"]].apply(zscore)
pca_x_train = pca.transform(sc_x_train)
pca_x_test = pca.transform(sc_x_test)
# Feed the projected data to a bare linear pipeline (already scaled).
pipe_lr = Pipeline([('rm', LinearRegression())])
model(pipe_lr, pca_x_train, y_train, pca_x_test, y_test, "")
# First with Original Age
#X = concrete.drop(['strength', 'Log_age', 'sum_cement', 'solid_volume', 'cw_wt'], axis=1)
# NOTE(review): 'sum_cement', 'solid_volume' and 'cw_wt' were added to
# `concrete` during feature engineering, so they remain part of X here --
# confirm that is intended (the commented line above would drop them).
X = concrete.drop(['strength', 'Log_age'], axis=1)
Y = concrete[['strength']]
print(X.head())
# 30% of the data will be used for testing
test_size= 0.30
seed = 100
k_fold = KFold(n_splits=10, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
pipeline = Pipeline(steps=[('pt', PowerTransformer()),('pf',PolynomialFeatures(degree = 2, interaction_only=True)), ('rm', Lasso(alpha=0.1))])
model(pipeline,x_train,y_train,x_test,y_test,"polycoef")
# higher the alpha value, more restriction on the coefficients; low alpha > more generalization, coefficients are barely
# restricted and in this case linear and ridge regression resembles
# prepare a uniform distribution to sample for the alpha parameter
# NOTE(review): '2, 2,5.5' contains a duplicate 2 and looks like a typo
# (perhaps 2.5 and 5 were intended) -- confirm.
param_grid = {'rm__alpha': [0.0001,0.001,0.005, 0.002,0.01,0.05,0.09,0.1,0.5,0.9,1.0,1.5,2, 2,5.5, 10, 100, 200]}
# create and fit a ridge regression model, testing random alpha values
pipe_lr = Pipeline([('scl', PowerTransformer()),('pf',PolynomialFeatures(degree = 2, interaction_only=True)), ('rm', Lasso(alpha=0.1))])
rsearch = GridSearchCV(pipe_lr, param_grid=param_grid, cv=10, scoring='neg_mean_absolute_error', n_jobs=4)
rsearch.fit(x_train,y_train)
print(rsearch)
print(rsearch.best_score_)
print(rsearch.best_estimator_["rm"].alpha)
# Refit with the tuned alpha value
pipeline = Pipeline(steps=[('pt', PowerTransformer()),('pf',PolynomialFeatures(degree = 2, interaction_only=True)), ('rm', Lasso(alpha=0.05))])
model(pipeline,x_train,y_train,x_test,y_test,"polycoef", "Lasso - PowerTransformer - Polynomial - TUNED - Original Data")
from sklearn.tree import DecisionTreeRegressor
#pipeline = Pipeline(steps=[('pt', PowerTransformer()),('pf',PolynomialFeatures(degree = 2, interaction_only=True)), ('rm', DecisionTreeRegressor())])
pipeline = Pipeline(steps=[('pt', PowerTransformer()), ('rm', DecisionTreeRegressor())])
model(pipeline,x_train,y_train,x_test,y_test,"feat", "DecisionTree - PowerTransformer - Original Data")
from sklearn.ensemble import RandomForestRegressor
pipeline = Pipeline(steps=[('pt', PowerTransformer()), ('rm', RandomForestRegressor())])
model(pipeline,x_train,y_train,x_test,y_test,"feat", "RandomForest - PowerTransformer - Original Data")
from sklearn.ensemble import GradientBoostingRegressor
pipeline = Pipeline(steps=[('pt', PowerTransformer()), ('rm', GradientBoostingRegressor())])
model(pipeline,x_train,y_train,x_test,y_test,"feat", "GradientBoosting - PowerTransformer - Original Data")
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
pipeline = Pipeline(steps=[ ('rm', XGBRegressor())])
model(pipeline,x_train,y_train,x_test,y_test,"feat", "XGBRegressor - Original Data")
# Summary table of all named runs, sorted best-first
dfObj = pd.DataFrame(modelResult).transpose()
df_u = dfObj.drop(["Model"],axis=1)
df_u.sort_values(by=['Pearsonsr', 'RMSE'], ascending=False)
## RandomForest is overfitting. Lets try to prune and hypertune it.
pipeline = Pipeline(steps=[('pt', StandardScaler()), ('rm', RandomForestRegressor())])
param_grid = {'rm__max_depth': np.arange(3, 8),
'rm__criterion' : ['mse','mae'],
'rm__max_leaf_nodes': [100,105, 90,95],
'rm__min_samples_split': [6,7,8,9,10],
'rm__max_features':['auto','sqrt','log2']}
grid_tree = GridSearchCV(pipeline, param_grid, cv = 5, scoring= 'r2', verbose=5, n_jobs=4)
grid_tree.fit(x_train, y_train)
# print(grid_tree.best_estimator_["rm"])
print("Best Params: \n",grid_tree.best_params_)
print("Best Score: \n",np.abs(grid_tree.best_score_))
grid_tree.best_params_
# Refit the RF with the best parameters found above (copied manually)
pipeline = Pipeline(steps=[('pt', StandardScaler()), ('rm', RandomForestRegressor(criterion='mse', max_depth=7, max_features='auto', max_leaf_nodes=90, min_samples_split=6))])
model(pipeline,x_train,y_train,x_test,y_test,"feat", "")
# Repeat the grid search with PowerTransformer scaling instead
pipeline = Pipeline(steps=[('pt', PowerTransformer()), ('rm', RandomForestRegressor())])
param_grid = {'rm__max_depth': np.arange(3, 8),
'rm__criterion' : ['mse','mae'],
'rm__max_leaf_nodes': [100,105, 90,95],
'rm__min_samples_split': [6,7,8,9,10],
'rm__max_features':['auto','sqrt','log2']}
grid_tree = GridSearchCV(pipeline, param_grid, cv = 5, scoring= 'r2', verbose=5, n_jobs=4)
grid_tree.fit(x_train, y_train)
print("Best Params: \n",grid_tree.best_params_)
print("Best Score: \n",np.abs(grid_tree.best_score_))
# NOTE(review): max_leaf_nodes=10 here, but the searched grid only covered
# 90-105 -- confirm this value was intended.
pipeline = Pipeline(steps=[('pt', PowerTransformer()), ('rm', RandomForestRegressor(criterion='mse', max_depth=7, max_features='auto', max_leaf_nodes=10, min_samples_split=6))])
model(pipeline,x_train,y_train,x_test,y_test,"feat", "")
# Randomized search over XGBoost parameters.
# NOTE(review): the n_estimators range starts at 0 (an invalid value) and
# 'min_samples_split' is a sklearn-tree parameter, not an XGBoost one;
# also n_iter=1000 exceeds the number of distinct grid combinations --
# behaviour depends on the pinned sklearn/xgboost versions, verify.
param_grid = {'n_estimators':range(0,500,50),
'max_depth':range(5,16,2), #range(5,16,2),
'min_samples_split':range(0,1001,200), #range(200,1001,200),
'learning_rate':[0.2]}
clf = RandomizedSearchCV(XGBRegressor(random_state=1),
param_distributions = param_grid, scoring='r2',
cv=10,n_jobs=-1, n_iter=1000, verbose=4).fit(x_train, y_train)
print(clf.best_estimator_)
print(clf.best_params_)
print("R Squared:",clf.best_score_)
#pipeline = Pipeline(steps=[ ('rm', XGBRegressor())])
#model(pipeline,x_train,y_train,x_test,y_test,"feat", "XGBRegressor - Original Data")
%%time
# Finer randomized search around the best region found above
param_grid = {'n_estimators':range(380,421,10),
'max_depth':range(3,7,1),
'min_samples_split':range(0,100,30), #range(200,1001,200),
'learning_rate':[0.1, 0.2, 0.3],
'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]}
clf = RandomizedSearchCV(XGBRegressor(random_state=1),
param_distributions = param_grid, scoring='r2',
cv=10,n_jobs=-1, n_iter=1000, verbose=4).fit(x_train, y_train)
print(clf.best_estimator_)
print(clf.best_params_)
print("R Squared:",clf.best_score_)
# Final model built from the tuned XGBoost parameters (pasted from
# best_estimator_ output).
# NOTE(review): 'min_samples_split' is not an XGBoost parameter and is
# presumably ignored by XGBRegressor -- verify against the pinned version.
pipeline = Pipeline(steps=[ ('rm', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
importance_type='gain', learning_rate=0.2, max_delta_step=0,
max_depth=4, min_child_weight=1, min_samples_split=30,
missing=None, n_estimators=420, n_jobs=1, nthread=None,
objective='reg:linear', random_state=1, reg_alpha=0.001,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
subsample=1, verbosity=1))])
model(pipeline,x_train,y_train,x_test,y_test,"feat", "")
# binomial confidence Level
# NOTE(review): accuracy=0.9625 is hardcoded (presumably the final model's
# score) and the binomial-proportion interval applies to classification
# accuracy, not a regression R^2 -- confirm this is the intended statistic.
from math import sqrt
test_size=len(y_test)
accuracy=0.9625
# 95% interval half-width, expressed as a percentage
interval = 1.96 * sqrt( (accuracy * (1 - accuracy)) / test_size)*100
print('%.3f' % interval, '%')